
/******************************************************************************
 *
 *  This file is part of canu, a software program that assembles whole-genome
 *  sequencing reads into contigs.
 *
 *  This software is based on:
 *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
 *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
 *
 *  Except as indicated otherwise, this is a 'United States Government Work',
 *  and is released in the public domain.
 *
 *  File 'README.licenses' in the root directory of this distribution
 *  contains full conditions and disclaimers.
 */

#include "findErrors.H"

//  Set  delta  to the entries indicating the insertions/deletions
//  in the alignment encoded in  edit_array  ending at position
//  edit_array[e][d].  row  is the position in the first
//  string where the alignment ended.  Set  (*delta_len)  to
//  the number of entries in  delta .

static
void
Compute_Delta(pedWorkArea_t   *WA,
              int32            e,
              int32            d,
              int32            row) {
  int32  last     = row;
  int32  stackLen = 0;

  for (int32 k=e; k>0; k--) {
    int32  from = d;
    int32  max  = 1 + WA->Edit_Array_Lazy[k-1][d];

    int32  j = WA->Edit_Array_Lazy[k-1][d-1];

    if (j > max) {
      from = d-1;
      max = j;
    }

    j = 1 + WA->Edit_Array_Lazy[k-1][d+1];

    if (j > max) {
      from = d+1;
      max = j;
    }

    if (from == d-1) {
      WA->deltaStack[stackLen++] = max - last - 1;
      d--;
      last = WA->Edit_Array_Lazy[k-1][from];
    }

    else if (from == d+1) {
      WA->deltaStack[stackLen++] = last - (max - 1);
      d++;
      last = WA->Edit_Array_Lazy[k-1][from];
    }
  }

  WA->deltaStack[stackLen++] = last + 1;

  for (int32 k=0, i=stackLen-1; i>0; i--)
    WA->delta[k++] = abs(WA->deltaStack[i]) * Sign(WA->deltaStack[i-1]);

  WA->deltaLen = stackLen - 1;
}





//  Allocate another block of 64mb for edits

//  Needs to be at least:
//       52,432 to handle 40% error at  64k overlap
//      104,860 to handle 80% error at  64k overlap
//      209,718 to handle 40% error at 256k overlap
//      419,434 to handle 80% error at 256k overlap
//    3,355,446 to handle 40% error at   4m overlap
//    6,710,890 to handle 80% error at   4m overlap
//  Bigger means we can assign more than one Edit_Array[] in one allocation.

uint32  EDIT_SPACE_SIZE  = 16 * 1024 * 1024;

static
void
Allocate_More_Edit_Space(pedWorkArea_t *WA) {

  //  Determine the last allocated block, and the last assigned block

  int32  b = 0;                 //  Last edit array assigned
  int32  e = 0;                 //  Last edit array assigned more space
  int32  a = WA->alloc.size();  //  First unallocated block

  while (WA->Edit_Array_Lazy[b] != NULL)
    b++;

  //  Fill in the edit space array.  Well, not quite yet.  First, decide the minimum size.
  //
  //  Element [0] can access from [-2] to [2] = 5 elements.
  //  Element [1] can access from [-3] to [3] = 7 elements.
  //
  //  Element [e] can access from [-2-e] to [2+e] = 5 + e * 2 elements
  //
  //  So, our offset for this new block needs to put [e][0] at offset...

  int32 Offset = 2 + b;
  int32 Del    = 6 + b * 2;
  int32 Size   = EDIT_SPACE_SIZE;

  while (Size < Offset + Del)
    Size *= 2;

  //  Allocate another block

  int32  *alloc = new int32 [Size];

  WA->alloc.push_back(alloc);

  //  And, now, fill in the edit space array.

  e = b;

  while ((Offset + Del < Size) &&
         (e < WA->Edit_Array_Max)) {
    WA->Edit_Array_Lazy[e++] = alloc + Offset;

    Offset += Del;
    Del    += 2;
  }

  if (e == b)
    fprintf(stderr, "Allocate_More_Edit_Space()-- ERROR: couldn't allocate enough space for even one more entry!  e=%d\n", e);
  assert(e != b);

  //fprintf(stderr, "WorkArea %d allocates space %d of size %d for array %d through %d\n", WA->thread_id, a, Size, b, e-1);
}





//  Return the minimum number of changes (inserts, deletes, replacements)
//  needed to match string  A[0 .. (m-1)]  with a prefix of string
//   T[0 .. (n-1)]  if it's not more than  Error_Limit .
//
//  Put delta description of alignment in  WA->delta  and set
//  WA->deltaLen to the number of entries there if it's a complete
//  match.
//  Set  A_End  and  T_End  to the rightmost positions where the
//  alignment ended in  A  and  T , respectively.
//  Set  Match_To_End  true if the match extended to the end
//  of at least one string; otherwise, set it false to indicate
//  a branch point.

int32
Prefix_Edit_Dist(char   *A, int32 m,
                 char   *T, int32 n,
                 int32   Error_Limit,
                 int32  &A_End,
                 int32  &T_End,
                 bool   &Match_To_End,
                 pedWorkArea_t *WA) {

  //assert (m <= n);

  int32 Best_d  = 0;
  int32 Best_e  = 0;
  int32 Longest = 0;

  WA->deltaLen = 0;

  int32 shorter = std::min(m, n);

  int32 Row = 0;
  while ((Row < shorter) && (A[Row] == T[Row]))
    Row++;

  if (WA->Edit_Array_Lazy[0] == NULL)
    Allocate_More_Edit_Space(WA);

  WA->Edit_Array_Lazy[0][0] = Row;

  // Exact match?
  if (Row == shorter) {
    A_End        = Row;
    T_End        = Row;
    Match_To_End = true;
    return(0);
  }

  int32  Left = 0;
  int32  Right = 0;
  double Max_Score = 0.0;
  int32  Max_Score_Len = 0;
  int32  Max_Score_Best_d = 0;
  int32  Max_Score_Best_e = 0;

  for (int32 e=1; e<=Error_Limit; e++) {
    if (WA->Edit_Array_Lazy[e] == NULL)
      Allocate_More_Edit_Space(WA);

    Left  = std::max(Left  - 1, -e);
    Right = std::min(Right + 1,  e);

    WA->Edit_Array_Lazy[e-1][Left]    = -2;
    WA->Edit_Array_Lazy[e-1][Left-1]  = -2;
    WA->Edit_Array_Lazy[e-1][Right]   = -2;
    WA->Edit_Array_Lazy[e-1][Right+1] = -2;

    for (int32 d=Left; d<=Right; d++) {
      Row = 1 + WA->Edit_Array_Lazy[e-1][d];
      Row = std::max(Row, WA->Edit_Array_Lazy[e-1][d-1]);
      Row = std::max(Row, WA->Edit_Array_Lazy[e-1][d+1] + 1);

      while ((Row < m) && (Row + d < n) && (A[Row] == T[Row + d]))
        Row++;

      assert(e < WA->Edit_Array_Max);

      WA->Edit_Array_Lazy[e][d] = Row;

      if (Row == m || Row + d == n) {

        // Force last error to be mismatch rather than insertion
        if ((Row == m) &&
            (1 + WA->Edit_Array_Lazy[e-1][d+1] == WA->Edit_Array_Lazy[e][d]) &&
            (d < Right)) {
          d++;
          WA->Edit_Array_Lazy[e][d] = WA->Edit_Array_Lazy[e][d-1];
        }

        A_End        = Row;           // One past last align position
        T_End        = Row + d;
        Match_To_End = true;

        Compute_Delta(WA, e, d, Row);

        return(e);
      }
    }

    while (Left <= Right && Left < 0
           && WA->Edit_Array_Lazy[e][Left] < WA->G->Edit_Match_Limit[e])
      Left++;

    if (Left >= 0)
      while (Left <= Right
             && WA->Edit_Array_Lazy[e][Left] + Left < WA->G->Edit_Match_Limit[e])
        Left++;

    if (Left > Right)
      break;

    while (Right > 0
           && WA->Edit_Array_Lazy[e][Right] + Right < WA->G->Edit_Match_Limit[e])
      Right--;

    if (Right <= 0)
      while (WA->Edit_Array_Lazy[e][Right] < WA->G->Edit_Match_Limit[e])
        Right--;

    assert (Left <= Right);

    for (int32 d=Left;  d <= Right;  d++)
      if (WA->Edit_Array_Lazy[e][d] > Longest) {
        Best_d  = d;
        Best_e  = e;
        Longest = WA->Edit_Array_Lazy[e][d];
      }

    int32  Score = Longest * BRANCH_PT_MATCH_VALUE - e;

    // Assumes  BRANCH_PT_MATCH_VALUE - BRANCH_PT_ERROR_VALUE == 1.0

    //  CorrectOverlaps didn't have the second clause.
    //  Neither did overlapper.

    if ((Score > Max_Score) &&
        (Best_e <= WA->G->Error_Bound[std::min(Longest, Longest + Best_d)])) {
      Max_Score        = Score;
      Max_Score_Len    = Longest;
      Max_Score_Best_d = Best_d;
      Max_Score_Best_e = Best_e;
    }
  }

  //  CorrectOverlaps doesn't have this call
  Compute_Delta(WA, Max_Score_Best_e, Max_Score_Best_d, Max_Score_Len);

  A_End        = Max_Score_Len;
  T_End        = Max_Score_Len + Max_Score_Best_d;
  Match_To_End = false;

  //  CorrectOverlaps was returning just 'e', not the best as below.
  //  It used this return value to compute the new error rate.

  return(Max_Score_Best_e);
}
